New chunk Ctrl+Alt+I
Execute chunk Ctrl+Shift+Enter
Execute all chunks Ctrl+Alt+R
HTML preview Ctrl+Shift+K
library(readr)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(reshape2)
library(stats)
# Tweet-level dataset covering several higher-education institutions (HEIs);
# tweet_id is read as character so long ids do not lose precision as doubles
data <- read.csv("~/4year/2semester/dtII/CSVs/HEIs.csv",
colClasses = c(tweet_id = "character"))
# Modifying created_at type so that attribute can be used more easily
# (ISO-8601 "YYYY-MM-DDTHH:MM:SS" timestamps parsed as UTC POSIXct)
data$created_at <- as.POSIXct(data$created_at,
format= "%Y-%m-%dT%H:%M:%S", tz="UTC")
#View(data)
summary(data)
id tweet_id text type bookmark_count favorite_count retweet_count reply_count
Length:11728 Length:11728 Length:11728 Length:11728 Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. : 0.000
Class :character Class :character Class :character Class :character 1st Qu.: 0.000 1st Qu.: 7.00 1st Qu.: 2.00 1st Qu.: 0.000
Mode :character Mode :character Mode :character Mode :character Median : 0.000 Median : 20.00 Median : 5.00 Median : 1.000
Mean : 1.543 Mean : 60.67 Mean : 10.62 Mean : 3.888
3rd Qu.: 1.000 3rd Qu.: 57.00 3rd Qu.: 11.00 3rd Qu.: 3.000
Max. :418.000 Max. :41655.00 Max. :4214.00 Max. :2317.000
view_count created_at hashtags urls media_type media_urls
Min. : 5 Min. :2022-08-01 03:05:11.00 Length:11728 Length:11728 Length:11728 Length:11728
1st Qu.: 2643 1st Qu.:2022-10-19 12:56:27.00 Class :character Class :character Class :character Class :character
Median : 6240 Median :2023-01-29 08:26:30.00 Mode :character Mode :character Mode :character Mode :character
Mean : 14182 Mean :2023-01-30 07:39:34.96
3rd Qu.: 16058 3rd Qu.:2023-05-05 14:16:43.25
Max. :7604544 Max. :2023-08-31 20:50:01.00
NA's :4840
# Count of how many entries each HEI has
number_interactions <- data %>%
  group_by(id) %>%
  tally(name = "count")
number_interactions
# Removing the complutense HEI from the data
# (presumably an outlier in entry count — see number_interactions)
data <- data[data$id != "complutense.csv", ]
# Per-HEI counts of all posts, of tweets only, and of replies only
number_posts <- data %>%
  group_by(id) %>%
  tally(name = "count")
number_tweets <- data[data$type == "Tweet", ] %>%
  group_by(id) %>%
  tally(name = "count")
number_replies <- data[data$type == "Reply", ] %>%
  group_by(id) %>%
  tally(name = "count")
print(number_posts)
print(number_tweets)
print(number_replies)
# Merging the counts of tweets (count.y) and replies (count) with the count of posts (count.x)
data_ratio <- merge(number_posts, number_tweets, by = "id", all = TRUE)
data_ratio <- merge(data_ratio, number_replies, by = "id", all = TRUE)
data_ratio$percentage_tweets <- 100 * data_ratio$count.y / data_ratio$count.x
data_ratio$percentage_replies <- 100 * data_ratio$count / data_ratio$count.x
data_ratio <- data_ratio[, c("id", "percentage_tweets", "percentage_replies")]
print(data_ratio)
na_count <- function(df = data){
  # Print a table with the number of NA values in each column of `df`.
  # Generalized: `df` defaults to the global `data` table, so existing
  # na_count() calls keep their original behavior.
  # Counting the number of NA values for each column
  na_totals <- colSums(is.na(df))
  # Creating a new data frame with the NA counts
  na_counts_table <- data.frame(Column = names(na_totals), NA_Count = na_totals)
  # print() returns its argument invisibly, so the table is also available
  # to callers who assign the result
  print(na_counts_table)
}
# Per-HEI percentile ranks (1-100) for each engagement metric; ntile() keeps
# NA inputs as NA. rowwise() then averages the four percentiles per tweet.
# NOTE: `data` remains a rowwise tibble after this pipeline.
data <- data %>%
group_by(id) %>%
mutate(view_percentile = ntile(view_count, 100),
favorite_percentile = ntile(favorite_count, 100),
retweet_percentile = ntile(retweet_count, 100),
reply_percentile = ntile(reply_count, 100)) %>%
rowwise() %>%
mutate(avg_percentile = mean(c(view_percentile, favorite_percentile, retweet_percentile, reply_percentile), na.rm = TRUE))
na_count()
# Highest observed view count per HEI; the result is a named vector indexed
# by id, which is used for per-row lookup below via max_view_counts[data$id]
max_view_counts <- tapply(data$view_count, data$id, max, na.rm = TRUE)
# From view count: a missing view count is estimated as that HEI's maximum
# view count scaled by the tweet's average engagement percentile
data$view_count <- ifelse(
is.na(data$view_count),
round(max_view_counts[data$id] * (data$avg_percentile / 100)),
data$view_count)
# From view percentile: fall back to the average percentile when missing
data$view_percentile <- ifelse(
is.na(data$view_percentile),
data$avg_percentile,
data$view_percentile)
na_count()
# Tweets only (original posts, excluding replies)
data_tweets <- data[data$type == "Tweet", ]
data_tweets
average_tweets <- function(timeframe = "days"){
  # Average number of tweets per unit of `timeframe` ("days", "weeks", ...)
  # for each HEI, measured over the span between its earliest and latest post.
  # Calculation of the timeframe between earliest and latest post for each HEI
  activity_span <- data_tweets %>%
    group_by(id) %>%
    summarise(min_date = min(created_at),
              max_date = max(created_at)) %>%
    mutate(num_days = as.numeric(difftime(max_date, min_date, units = timeframe)))
  # Output column named after the requested timeframe
  result_column <- paste0("avg_tweets_per_", timeframe)
  # Number of tweets divided by the length of the active period
  tweets_per_timeframe <- number_tweets %>%
    left_join(activity_span, by = "id") %>%
    mutate(!!result_column := count / num_days)
  print(tweets_per_timeframe)
  tweets_per_timeframe
}
tweets_per_day <- average_tweets()
tweets_per_week <- average_tweets(timeframe = "weeks")
# Bar chart of average tweets per day per HEI, with rounded value labels.
# barplot() invisibly returns the bar midpoints, which are reused to
# centre each label over its bar.
day_values <- tweets_per_day$avg_tweets_per_days
day_mids <- barplot(day_values,
                    names.arg = tweets_per_day$id,
                    main = "Average Tweets per Day",
                    xlab = "HEI",
                    ylab = "Average Number of Tweets",
                    ylim = c(0, max(day_values) + 1),
                    las = 2,
                    col = "#3498DB")
text(x = day_mids, y = day_values, labels = round(day_values, 2), pos = 3)
# Same chart for the weekly averages
week_values <- tweets_per_week$avg_tweets_per_weeks
week_mids <- barplot(week_values,
                     names.arg = tweets_per_week$id,
                     main = "Average Tweets per Week",
                     xlab = "HEI",
                     ylab = "Average Number of Tweets",
                     ylim = c(0, max(week_values) + 5),
                     las = 2,
                     col = "#E74C3C")
text(x = week_mids, y = week_values, labels = round(week_values, 2), pos = 3)
# Academic-term date ranges (start, end) used to label each tweet below.
# NOTE(review): as.POSIXct() without tz= parses these in the LOCAL timezone,
# while created_at was parsed as UTC — confirm the possible off-by-hours
# boundary shift is acceptable.
intervals <- list(
interval1 = as.POSIXct(c("2022-08-31", "2022-12-15")),
interval2 = as.POSIXct(c("2023-01-04", "2023-04-01")),
interval3 = as.POSIXct(c("2023-04-14", "2023-06-15"))
)
check_interval <- function(date, periods = intervals) {
  # TRUE when the (scalar) `date` falls inside any [start, end] period.
  # Generalized: `periods` defaults to the global `intervals` list, so
  # existing check_interval(date) calls behave exactly as before.
  for (i in seq_along(periods)) {
    # Scalar comparison, so the short-circuiting && is the right operator
    if (date >= periods[[i]][1] && date <= periods[[i]][2]) {
      return(TRUE)
    }
  }
  FALSE
}
# Flag every tweet as posted during an academic interval (TRUE) or not
data_tweets$academic_year <- sapply(data_tweets$created_at, check_interval)
print(data.frame(id = data_tweets$id, academic_year = data_tweets$academic_year))
# Count tweets per period once; reused for the bars, the y-limit and labels
period_counts <- table(data_tweets$academic_year)
# barplot() returns one midpoint per bar, used to place the labels.
# BUG FIX: the original passed the raw logical vector to
# barplot(..., plot = FALSE), producing one midpoint per tweet instead of
# one per bar, so the labels were misplaced.
period_mids <- barplot(period_counts,
                       main = "Number of Tweets per Timeframe",
                       xlab = "Time",
                       ylab = "Count",
                       ylim = c(0, max(period_counts) + 1000),
                       names.arg = c("Vacation", "Academic"),
                       col = c("#8E44AD", "#F1C40F"))
text(x = period_mids,
     y = period_counts + 0.5,
     labels = period_counts,
     pos = 3)
analyze_tweets <- function(academic_year_filter = TRUE) {
  # Average tweets per active day for each HEI, restricted to either
  # academic time (TRUE) or vacation time (FALSE).
  # Filtering the data based on the academic_year_filter
  filtered_data <- data_tweets %>%
    filter(academic_year == academic_year_filter)
  # Number of distinct calendar days with at least one tweet, per HEI
  unique_days <- filtered_data %>%
    group_by(id) %>%
    summarise(unique_days = n_distinct(as.Date(created_at)))
  # Number of tweets per HEI in the selected period
  number_tweets_boolean <- filtered_data %>%
    group_by(id) %>%
    summarise(count = n())
  # Output column named after the selected period (scalar condition,
  # so a plain if/else is used)
  period <- if (academic_year_filter) "academic_time" else "vacation_time"
  column_name <- paste0("avg_tweets_in_", period)
  # Tweets divided by the number of active days
  combined_data <- unique_days %>%
    left_join(number_tweets_boolean, by = "id") %>%
    mutate(!!column_name := count / unique_days)
  print(combined_data)
  combined_data
}
data_tweets_academic <- analyze_tweets()
data_tweets_vacations <- analyze_tweets(academic_year_filter = FALSE)
# Bar chart of average tweets per active day during academic time;
# the midpoints returned by barplot() are reused to place the labels
academic_values <- data_tweets_academic$avg_tweets_in_academic_time
academic_mids <- barplot(academic_values,
                         names.arg = data_tweets_academic$id,
                         main = "Average Tweets during Academic Time",
                         xlab = "HEI",
                         ylab = "Average Number of Tweets",
                         ylim = c(0, max(academic_values) + 5),
                         las = 2,
                         col = "#34495E")
text(x = academic_mids, y = academic_values,
     labels = round(academic_values, 2), pos = 3)
# Same chart for vacation time
vacation_values <- data_tweets_vacations$avg_tweets_in_vacation_time
vacation_mids <- barplot(vacation_values,
                         names.arg = data_tweets_vacations$id,
                         main = "Average Tweets during Vacation Time",
                         xlab = "HEI",
                         ylab = "Average Number of Tweets",
                         ylim = c(0, max(vacation_values) + 5),
                         las = 2,
                         col = "#D35400")
text(x = vacation_mids, y = vacation_values,
     labels = round(vacation_values, 2), pos = 3)
# Creating new table that contains a new column for the day of the week
data_tweets_days <- data_tweets %>%
  mutate(day_of_week = weekdays(created_at))
# Selecting only the id, created_at, and day_of_week columns for the new table
data_tweets_days <- data_tweets_days %>%
  select(id, created_at, day_of_week)
print(data_tweets_days)
# Grouping by id and day_of_week, then counting the number of tweets.
# .groups = "drop_last" is the default result (still grouped by id) made
# explicit, which silences the summarise() message without changing anything.
number_tweets_days <- data_tweets_days %>%
  group_by(id, day_of_week) %>%
  summarise(count = n(), .groups = "drop_last")
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
# Grouping by id, day_of_week and day created at, then counting the number of tweets.
# .groups = "drop_last" keeps the default grouping (id, day_of_week) explicit
# and silences the summarise() message.
number_tweets_per_day <- data_tweets_days %>%
  mutate(created_date = as.Date(created_at)) %>%
  group_by(id, day_of_week, created_date) %>%
  summarise(count = n(), .groups = "drop_last")
`summarise()` has grouped output by 'id', 'day_of_week'. You can override using the `.groups` argument.
# Finding for each HEI the average count of tweets per day.
# .groups = "drop_last" keeps the default grouping (by id) explicit and
# silences the summarise() message.
average_number_tweets_per_day <- number_tweets_per_day %>%
  group_by(id, day_of_week) %>%
  summarise(average_count = mean(count), .groups = "drop_last")
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
print(number_tweets_days)
# Finding the HEI with the lowest count of tweets per day
# slice_min() keeps ALL tied rows and returns them ordered by count
# within each weekday group
lowest_count <- number_tweets_days %>%
group_by(day_of_week) %>%
slice_min(order_by = count) %>%
select(day_of_week, id, count)
# Finding the HEI with the highest count of tweets per day
# slice_max() likewise keeps ties, ordered by descending count
highest_count <- number_tweets_days %>%
group_by(day_of_week) %>%
slice_max(order_by = count) %>%
select(day_of_week, id, count)
# Combine the results
# (per weekday: the minima first, then the maxima, from bind_rows order)
high_low_HEI <- bind_rows(lowest_count, highest_count) %>%
arrange(day_of_week)
print(high_low_HEI)
# Dodged bars comparing the lowest- and highest-tweeting HEI per weekday
high_low_plot <- ggplot(high_low_HEI, aes(x = day_of_week, y = count, fill = id)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = count),
            position = position_dodge(width = 0.9),
            vjust = -0.5,
            size = 3) +
  scale_fill_manual(values = rainbow(length(unique(high_low_HEI$id)))) +
  labs(title = "Lowest and Highest Count of Tweets per Day for Each Day of the Week",
       x = "Day of the Week", y = "Count") +
  theme_minimal() +
  theme(legend.title = element_blank())
print(high_low_plot)
# Finding the HEI with lowest and highest averaged count of tweets per day.
# filter() runs per weekday group, keeping the rows that match that day's
# maximum and minimum average (ties included).
high_low_average_HEIs <- average_number_tweets_per_day %>%
group_by(day_of_week) %>%
filter(average_count == max(average_count) | average_count == min(average_count)) %>%
# NOTE(review): arrange() ignores grouping by default, so min() in the sort
# key below is the GLOBAL minimum, not the per-day one — confirm whether the
# intended ordering is actually achieved.
arrange(day_of_week, ifelse(average_count == min(average_count), average_count, -average_count))
print(high_low_average_HEIs)
# Dodged bars of the highest/lowest AVERAGE tweet counts per weekday
ggplot(high_low_average_HEIs, aes(x = day_of_week, y = average_count, fill = id)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = round(average_count, 2)),
            position = position_dodge(width = 0.7),
            vjust = -0.5,
            size = 3) +
  labs(title = "Highest and Lowest Average Count of Tweets per Day for Each Day of the Week",
       x = "Day of the Week", y = "Average Count") +
  # BUG FIX: the palette was sized from high_low_HEI (a different table);
  # it must cover the ids actually present in THIS plot's data, otherwise
  # the unnamed palette can be too short or colour the wrong ids
  scale_fill_manual(values = rainbow(length(unique(high_low_average_HEIs$id)))) +
  theme_minimal() +
  theme(legend.title = element_blank())
# Table containing views, likes, retweets and replies for each media type for each HEI.
# .groups = "drop_last" keeps the default grouping (by id) explicit and
# silences the summarise() message.
types_of_tweets <- data_tweets %>%
  group_by(id, media_type) %>%
  summarise(count = n(),
            views = sum(view_count, na.rm = TRUE),
            likes = sum(favorite_count, na.rm = TRUE),
            retweets = sum(retweet_count, na.rm = TRUE),
            replies = sum(reply_count, na.rm = TRUE),
            .groups = "drop_last")
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
print(types_of_tweets)
# Grouping by HEI and calculating the total values of views, likes and
# replies across all media types; across() with .names produces the
# total_views / total_likes / total_replies columns
total_tweets_stats <- types_of_tweets %>%
  group_by(id) %>%
  summarise(across(c(views, likes, replies), sum, .names = "total_{.col}"))
print(total_tweets_stats)
pie_maker <- function(target_id = "duke.csv"){
  # Print three pie charts (views, likes, replies shares by media type)
  # for one HEI, identified by its id (the source csv file name).
  # Refactored: the three near-identical chart builders are collapsed into
  # one local helper, parameterized by metric column, palette and title.
  # Aggregating the per-media-type stats for the chosen HEI
  hei_media <- types_of_tweets %>%
    filter(id == target_id) %>%
    group_by(media_type) %>%
    summarise(total_views = sum(views),
              total_likes = sum(likes),
              total_replies = sum(replies))
  # Local helper: one pie chart for a single metric column of hei_media
  build_pie <- function(metric_col, palette, metric_label) {
    totals <- hei_media[[metric_col]]
    chart_data <- hei_media
    chart_data$total <- totals
    # Share of the metric attributed to each media type
    chart_data$percentage <- totals / sum(totals) * 100
    ggplot(chart_data, aes(x = "", y = percentage, fill = media_type)) +
      geom_bar(stat = "identity", width = 1) +
      coord_polar("y", start = 0) +
      theme_void() +
      theme(legend.position = "right") +
      geom_text(aes(label = paste(media_type, "\n", total, "(", round(percentage, 1), "%)")),
                position = position_stack(vjust = 0.5), color = "#FFFFFF") +
      scale_fill_manual(values = palette) +
      labs(title = paste(metric_label, "for each media type -", target_id))
  }
  # Print the pie charts (same palettes as before, one per metric)
  print(build_pie("total_views",
                  c("no_media" = "#2196F3", "animated_gif" = "#E67E22",
                    "photo" = "#8E44AD", "video" = "#138D75"),
                  "Views"))
  print(build_pie("total_likes",
                  c("no_media" = "#E91E63", "animated_gif" = "#4A148C",
                    "photo" = "#90CAF9", "video" = "#00BFA5"),
                  "Likes"))
  print(build_pie("total_replies",
                  c("no_media" = "#666600", "animated_gif" = "#99CCCC",
                    "photo" = "#9966CC", "video" = "#330000"),
                  "Replies"))
}
# Rendering the media-type pie charts for every HEI
hei_ids <- c("duke.csv", "epfl.csv", "goe.csv", "harvard.csv", "leicester.csv",
             "manchester.csv", "mit.csv", "sb.csv", "stanford.csv",
             "trinity.csv", "wv.csv", "yale.csv")
for (hei in hei_ids) {
  pie_maker(hei)
}
# Calculation of like_ratio and replies_ratio percentages (relative to views)
ratios_tweets_table <- total_tweets_stats %>%
  mutate(like_ratio = 100 * total_likes / total_views,
         replies_ratio = 100 * total_replies / total_views)
# New table with each HEI and its two engagement ratios
hei_tweets_ratios <- ratios_tweets_table %>%
  distinct(id, like_ratio, replies_ratio)
print(hei_tweets_ratios)
# Overlaid bars of the like and replies ratios per HEI.
# FIX: legend/title spelling "Replys" corrected to "Replies" for consistency
# with the averaged-ratio plot later in the file.
ggplot(hei_tweets_ratios, aes(x = id)) +
  geom_bar(aes(y = like_ratio, fill = "Like Ratio"), stat = "identity", position = "dodge") +
  geom_bar(aes(y = replies_ratio, fill = "Replies Ratio"), stat = "identity", position = "dodge") +
  geom_text(aes(y = like_ratio, label = round(like_ratio, 2)), vjust = -0.5, position = position_dodge(width = 0.9), size = 3, color = "#000000") +
  geom_text(aes(y = replies_ratio, label = round(replies_ratio, 2)), vjust = -0.5, position = position_dodge(width = 0.9), size = 3, color = "#FFFFFF") +
  labs(title = "Like and Replies Ratios by HEI",
       x = "HEI",
       y = "Ratio (%)",
       fill = "Metric") +
  scale_fill_manual(values = c("Like Ratio" = "#2196F3", "Replies Ratio" = "#F44336")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Table with averages of views, likes, retweets and replies.
# Each (id, media_type) pair has exactly one row in types_of_tweets, so
# mean(x / count) is simply that group's per-tweet value.
# .groups = "drop_last" keeps the default grouping (by id) explicit and
# silences the summarise() message.
types_of_tweets_per_tweet <- types_of_tweets %>%
  group_by(id, media_type) %>%
  summarise(avg_views = mean(views / count),
            avg_likes = mean(likes / count),
            avg_retweets = mean(retweets / count),
            avg_replies = mean(replies / count),
            .groups = "drop_last")
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
print(types_of_tweets_per_tweet)
# Grouping by HEI and summing the per-tweet averages across all media types
# (column names avg_views / avg_likes / avg_replies are preserved by across())
total_average_stats <- types_of_tweets_per_tweet %>%
  group_by(id) %>%
  summarise(across(c(avg_views, avg_likes, avg_replies), sum))
print(total_average_stats)
# Calculation of like_ratio and replies_ratio percentages (relative to views)
ratios_average_table <- total_average_stats %>%
  mutate(like_ratio = 100 * avg_likes / avg_views,
         replies_ratio = 100 * avg_replies / avg_views)
# New table with each HEI and its two averaged engagement ratios
hei_average_ratios <- ratios_average_table %>%
  distinct(id, like_ratio, replies_ratio)
print(hei_average_ratios)
# Overlaid bars of the averaged like/replies ratios per HEI
avg_ratio_plot <- ggplot(hei_average_ratios, aes(x = id)) +
  geom_bar(aes(y = like_ratio, fill = "Like Ratio"), stat = "identity", position = "dodge") +
  geom_bar(aes(y = replies_ratio, fill = "Replies Ratio"), stat = "identity", position = "dodge") +
  geom_text(aes(y = like_ratio, label = round(like_ratio, 2)), vjust = -0.5, position = position_dodge(width = 0.9), size = 3, color = "#000000") +
  geom_text(aes(y = replies_ratio, label = round(replies_ratio, 2)), vjust = -0.5, position = position_dodge(width = 0.9), size = 3, color = "#FFFFFF") +
  scale_fill_manual(values = c("Like Ratio" = "#330066", "Replies Ratio" = "#FF6666")) +
  labs(title = "Like and Replies Ratios by HEI",
       x = "HEI",
       y = "Ratio (%)",
       fill = "Metric") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(avg_ratio_plot)
# Create column hour from created_at
# Hour of day (0-23) extracted from the timestamp as a number
data_tweets_days$created_hour <- as.numeric(format(data_tweets_days$created_at, "%H"))
heatmap_maker <- function(target_id = "duke.csv"){
  # Build a weekday x hour heatmap of tweet volume for one HEI.
  # Returns the ggplot object, which auto-prints when called at top level.
  # Filtering data for the specific HEI
  target_data <- data_tweets_days %>%
    filter(id == target_id)
  # Grouping by day of the week and hour, and counting the number of tweets.
  # .groups = "drop" removes the per-call summarise() message; the result
  # feeds straight into ggplot, which ignores grouping anyway.
  tweet_counts <- target_data %>%
    group_by(day_of_week, created_hour) %>%
    summarise(num_tweets = n(), .groups = "drop")
  # Plotting heatmap
  ggplot(tweet_counts, aes(x = day_of_week, y = created_hour, fill = num_tweets)) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "blue") +
    labs(title = paste("Tweet Heatmap for", target_id),
         x = "Day of the week",
         y = "Hour of the day")
}
# Rendering the weekday/hour heatmap for every HEI; inside a for loop the
# returned ggplot objects must be printed explicitly
for (hei in c("duke.csv", "epfl.csv", "goe.csv", "harvard.csv", "leicester.csv",
              "manchester.csv", "mit.csv", "sb.csv", "stanford.csv",
              "trinity.csv", "wv.csv", "yale.csv")) {
  print(heatmap_maker(hei))
}
# Word-count statistics on the tweet text: number of whitespace-separated
# tokens per tweet
data_tweets_content <- data_tweets %>%
  select(id, text) %>%
  mutate(num_words = lengths(strsplit(text, "\\s+")))
print(data_tweets_content)
# Grouping by HEI and computing average, minimum, and maximum word counts
data_tweets_content_metrics <- data_tweets_content %>%
  group_by(id) %>%
  summarise(average_num_words = mean(num_words),
            min_num_words = min(num_words),
            max_num_words = max(num_words))
print(data_tweets_content_metrics)
# Average word count per HEI, with the min-max range drawn as error bars
word_plot <- ggplot(data_tweets_content_metrics, aes(x = id, y = average_num_words)) +
  geom_point(aes(color = "Average")) +
  geom_errorbar(aes(ymin = min_num_words, ymax = max_num_words, color = "Range"), width = 0.2) +
  scale_color_manual(values = c("Average" = "#1976D2", "Range" = "#EF5350")) +
  labs(title = "Word Count Summary by HEI",
       x = "HEI",
       y = "Number of Words",
       color = "Metric") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(word_plot)
# Replies only
data_replies <- data[data$type == "Reply", ]
data_replies
# Table containing views, likes, retweets and replies for each media type for each HEI.
# .groups = "drop_last" keeps the default grouping (by id) explicit and
# silences the summarise() message.
types_of_replies <- data_replies %>%
  group_by(id, media_type) %>%
  summarise(count = n(),
            views = sum(view_count, na.rm = TRUE),
            likes = sum(favorite_count, na.rm = TRUE),
            retweets = sum(retweet_count, na.rm = TRUE),
            replies = sum(reply_count, na.rm = TRUE),
            .groups = "drop_last")
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
print(types_of_replies)
# Grouping by HEI and calculating the total values of views, likes and
# replies across all media types; across() with .names produces the
# total_views / total_likes / total_replies columns
total_replies_stats <- types_of_replies %>%
  group_by(id) %>%
  summarise(across(c(views, likes, replies), sum, .names = "total_{.col}"))
print(total_replies_stats)
# Calculation of like_ratio and replies_ratio percentages (relative to views)
ratios_replies_table <- total_replies_stats %>%
  mutate(like_ratio = 100 * total_likes / total_views,
         replies_ratio = 100 * total_replies / total_views)
# New table with each HEI and its two reply-engagement ratios
hei_replies_ratios <- ratios_replies_table %>%
  distinct(id, like_ratio, replies_ratio)
print(hei_replies_ratios)
# Creating table for cluster algorithms
# NOTE(review): `unique_hashtags` is not defined anywhere in this chunk —
# it is presumably built in an earlier hashtag-analysis chunk; confirm it
# exists before running this section.
# Joining attribute count (number of tweets) and unique_hashtags (number of unique hashtags) per HEI
cluster_table <- merge(select(unique_hashtags, id, unique_hashtags), select(number_tweets, id, count), by = "id", all=TRUE)
# Joining attribute avg_tweets_per_days (average of tweets per day) per HEI
cluster_table <- merge(cluster_table, select(tweets_per_day, id, avg_tweets_per_days), by = "id", all=TRUE)
# Joining attribute avg_tweets_per_weeks (average of tweets per week) per HEI
cluster_table <- merge(cluster_table, select(tweets_per_week, id, avg_tweets_per_weeks), by = "id", all=TRUE)
# Joining attribute avg_tweets_in_academic_time (average of tweets during academic time) per HEI
cluster_table <- merge(cluster_table, select(data_tweets_academic, id, avg_tweets_in_academic_time), by = "id", all=TRUE)
# Joining attribute avg_tweets_in_vacation_time (average of tweets during vacation time) per HEI
cluster_table <- merge(cluster_table, select(data_tweets_vacations, id, avg_tweets_in_vacation_time), by = "id", all=TRUE)
# Joining attribute total_views (total number of views), total_likes (total number of likes) and total_replies (total number of replies) per HEI
cluster_table <- merge(cluster_table, select(total_tweets_stats, id, total_views, total_likes, total_replies), by = "id", all=TRUE)
# Renaming attribute like_ratio to total_like_ratio and replies_ratio to total_replies_ratio
# (the rename must happen before the next merge, otherwise the second
# like_ratio / replies_ratio pair would collide with these columns)
cluster_table <- merge(cluster_table, select(hei_tweets_ratios, id, like_ratio, replies_ratio), by = "id", all=TRUE)
cluster_table <- cluster_table %>%
rename(total_like_ratio = like_ratio,
total_replies_ratio = replies_ratio)
# Joining attribute avg_views (average number of views), avg_likes (average number of likes) and avg_replies (average number of replies) per HEI
cluster_table <- merge(cluster_table, select(total_average_stats, id, avg_views, avg_likes, avg_replies), by = "id", all=TRUE)
# Renaming attribute like_ratio to avg_like_ratio and replies_ratio to avg_replies_ratio
cluster_table <- merge(cluster_table, select(hei_average_ratios, id, like_ratio, replies_ratio), by = "id", all=TRUE)
cluster_table <- cluster_table %>%
rename(avg_like_ratio = like_ratio,
avg_replies_ratio = replies_ratio)
print(cluster_table)
cluster_maker <- function(seed = 123, num_clusters = 3, table){
  # Run k-means on the scaled numeric columns of `table` (all but id),
  # print the cluster centers and assignments, and return the kmeans object.
  # BUG FIX: the seed argument was silently ignored — set.seed(123) was
  # hard-coded, so e.g. seed = 4855 reproduced the seed = 123 results.
  set.seed(seed)
  # Excluding id column for clustering (base subsetting keeps column order
  # and works for both data.frames and tibbles)
  cluster_data <- table[, setdiff(names(table), "id"), drop = FALSE]
  # Scaling the data so every attribute contributes comparably to kmeans
  scaled_data <- scale(cluster_data)
  kmeans_result <- kmeans(scaled_data, centers = num_clusters)
  print(kmeans_result$centers)
  print(kmeans_result$cluster)
  return(kmeans_result)
}
cluster_id_maker <- function(kmeans_result, table){
  # Print the id -> cluster mapping and plot the raw cluster assignments
  assignments <- data.frame(id = table$id, cluster = kmeans_result$cluster)
  print(assignments)
  plot(kmeans_result$cluster)
}
cluster_123_3 <- cluster_maker(table = cluster_table)
unique_hashtags count avg_tweets_per_days avg_tweets_per_weeks avg_tweets_in_academic_time avg_tweets_in_vacation_time total_views total_likes
1 -0.6314353 -0.9289798 -0.9124788 -0.9124788 -0.8237303 -0.8263586 -0.4559850 0.07640829
2 -0.4672956 1.2357198 1.2354314 1.2354314 1.1911790 1.2741657 2.0971401 1.93751241
3 0.1740030 -0.1713844 -0.1731538 -0.1731538 -0.1731809 -0.1913303 -0.4153661 -0.43904812
total_replies total_like_ratio total_replies_ratio avg_views avg_likes avg_replies avg_like_ratio avg_replies_ratio
1 0.5534067 2.0996788 3.0684868 -0.3467066 1.9304181 2.3142417 2.6030420 3.1020546
2 1.6810073 -0.7177677 -0.4777518 1.7460208 1.4388566 1.1599670 -0.6396304 -0.4169707
3 -0.4350468 -0.0737937 -0.2347759 -0.3494817 -0.5342368 -0.5149084 -0.1470868 -0.2520126
[1] 3 3 3 2 3 3 2 3 1 3 3 3
cluster_id_maker(cluster_123_3, table = cluster_table)
# NOTE(review): 7 clusters are requested but stored in `cluster_123_6` —
# the variable name looks like a copy-paste slip
cluster_123_6 <- cluster_maker(num_clusters = 7, table = cluster_table)
unique_hashtags count avg_tweets_per_days avg_tweets_per_weeks avg_tweets_in_academic_time avg_tweets_in_vacation_time total_views total_likes
1 -0.54099099 -0.7980288 -0.7998764 -0.7998764 -0.8566746 -0.7686479 -0.4659280 -0.34870651
2 -0.33665384 2.0980363 2.1001655 2.1001655 2.2152351 2.2141794 1.6390693 2.76526795
3 2.06849496 0.3163906 0.3164848 0.3164848 0.3487604 0.3235393 -0.4385686 -0.45795609
4 -0.36345215 -0.5258824 -0.5295450 -0.5295450 -0.5074127 -0.5879623 -0.3946334 -0.54854585
5 -0.59793741 0.3734033 0.3706973 0.3706973 0.1671229 0.3341520 2.5552108 1.10975687
6 -0.63143530 -0.9289798 -0.9124788 -0.9124788 -0.8237303 -0.8263586 -0.4559850 0.07640829
7 -0.03517279 1.5243464 1.5265794 1.5265794 1.4868514 1.5200937 -0.3507682 -0.14392452
total_replies total_like_ratio total_replies_ratio avg_views avg_likes avg_replies avg_like_ratio avg_replies_ratio
1 -0.44336928 1.2911568 0.07105937 -0.3861273 -0.3579270 -0.4728730 0.8212504 -0.10327394
2 2.91587745 -0.6417247 -0.40143781 0.3936984 1.4654190 1.5919118 -0.5339425 -0.30190583
3 -0.46641612 -0.2251792 -0.13401717 -0.3929316 -0.6804893 -0.5991306 -0.2493630 -0.12958214
4 -0.51767840 -0.6173697 -0.47022920 -0.3036546 -0.5504837 -0.5144803 -0.5587944 -0.43018681
5 0.44613716 -0.7938108 -0.55406573 3.0983433 1.4122942 0.7280222 -0.7453183 -0.53203567
6 0.55340665 2.0996788 3.06848680 -0.3467066 1.9304181 2.3142417 2.6030420 3.10205460
7 -0.02513687 -0.3266195 -0.10615086 -0.3725990 -0.5293640 -0.4322473 -0.2323782 -0.08165369
[1] 4 4 1 2 3 4 5 4 6 3 1 7
cluster_id_maker(cluster_123_6, table = cluster_table)
# NOTE(review): 5 clusters stored in `cluster_123_6` — the name does not
# match the cluster count and overwrites the previous 7-cluster result
cluster_123_6 <- cluster_maker(num_clusters = 5, table = cluster_table)
unique_hashtags count avg_tweets_per_days avg_tweets_per_weeks avg_tweets_in_academic_time avg_tweets_in_vacation_time total_views total_likes
1 -0.6314353 -0.9289798 -0.9124788 -0.9124788 -0.8237303 -0.8263586 -0.4559850 0.07640829
2 -0.3366538 2.0980363 2.1001655 2.1001655 2.2152351 2.2141794 1.6390693 2.76526795
3 1.3672724 0.7190425 0.7198496 0.7198496 0.7281241 0.7223908 -0.4093018 -0.35327890
4 -0.4226318 -0.6165979 -0.6196555 -0.6196555 -0.6238333 -0.6481908 -0.4183983 -0.48193274
5 -0.5979374 0.3734033 0.3706973 0.3706973 0.1671229 0.3341520 2.5552108 1.10975687
total_replies total_like_ratio total_replies_ratio avg_views avg_likes avg_replies avg_like_ratio avg_replies_ratio
1 0.5534067 2.09967875 3.0684868 -0.3467066 1.9304181 2.3142417 2.6030420 3.1020546
2 2.9158774 -0.64172468 -0.4014378 0.3936984 1.4654190 1.5919118 -0.5339425 -0.3019058
3 -0.3193230 -0.25899262 -0.1247284 -0.3861541 -0.6301142 -0.5435029 -0.2437014 -0.1136060
4 -0.4929087 0.01880576 -0.2897997 -0.3311455 -0.4862981 -0.5006112 -0.0987795 -0.3212159
5 0.4461372 -0.79381078 -0.5540657 3.0983433 1.4122942 0.7280222 -0.7453183 -0.5320357
[1] 4 4 4 2 3 4 5 4 1 3 4 3
cluster_id_maker(cluster_123_6, table = cluster_table)
# NOTE(review): verify that cluster_maker actually applies this seed
# argument — the pasted centers below match the seed-123 runs exactly
cluster_123_3 <- cluster_maker(seed = 4855, num_clusters = 4, table = cluster_table)
unique_hashtags count avg_tweets_per_days avg_tweets_per_weeks avg_tweets_in_academic_time avg_tweets_in_vacation_time total_views total_likes
1 -0.6314353 -0.9289798 -0.9124788 -0.9124788 -0.8237303 -0.8263586 -0.4559850 0.07640829
2 -0.4672956 1.2357198 1.2354314 1.2354314 1.1911790 1.2741657 2.0971401 1.93751241
3 1.3672724 0.7190425 0.7198496 0.7198496 0.7281241 0.7223908 -0.4093018 -0.35327890
4 -0.4226318 -0.6165979 -0.6196555 -0.6196555 -0.6238333 -0.6481908 -0.4183983 -0.48193274
total_replies total_like_ratio total_replies_ratio avg_views avg_likes avg_replies avg_like_ratio avg_replies_ratio
1 0.5534067 2.09967875 3.0684868 -0.3467066 1.9304181 2.3142417 2.6030420 3.1020546
2 1.6810073 -0.71776773 -0.4777518 1.7460208 1.4388566 1.1599670 -0.6396304 -0.4169707
3 -0.3193230 -0.25899262 -0.1247284 -0.3861541 -0.6301142 -0.5435029 -0.2437014 -0.1136060
4 -0.4929087 0.01880576 -0.2897997 -0.3311455 -0.4862981 -0.5006112 -0.0987795 -0.3212159
[1] 4 4 4 2 3 4 2 4 1 3 4 3
cluster_id_maker(cluster_123_3, table = cluster_table)
# 6 clusters with the alternative seed
cluster_123_6 <- cluster_maker(seed = 4855, num_clusters = 6, table = cluster_table)
unique_hashtags count avg_tweets_per_days avg_tweets_per_weeks avg_tweets_in_academic_time avg_tweets_in_vacation_time total_views total_likes
1 -0.5409910 -0.7980288 -0.7998764 -0.7998764 -0.8566746 -0.7686479 -0.4659280 -0.34870651
2 -0.3366538 2.0980363 2.1001655 2.1001655 2.2152351 2.2141794 1.6390693 2.76526795
3 1.3672724 0.7190425 0.7198496 0.7198496 0.7281241 0.7223908 -0.4093018 -0.35327890
4 -0.3634522 -0.5258824 -0.5295450 -0.5295450 -0.5074127 -0.5879623 -0.3946334 -0.54854585
5 -0.5979374 0.3734033 0.3706973 0.3706973 0.1671229 0.3341520 2.5552108 1.10975687
6 -0.6314353 -0.9289798 -0.9124788 -0.9124788 -0.8237303 -0.8263586 -0.4559850 0.07640829
total_replies total_like_ratio total_replies_ratio avg_views avg_likes avg_replies avg_like_ratio avg_replies_ratio
1 -0.4433693 1.2911568 0.07105937 -0.3861273 -0.3579270 -0.4728730 0.8212504 -0.1032739
2 2.9158774 -0.6417247 -0.40143781 0.3936984 1.4654190 1.5919118 -0.5339425 -0.3019058
3 -0.3193230 -0.2589926 -0.12472840 -0.3861541 -0.6301142 -0.5435029 -0.2437014 -0.1136060
4 -0.5176784 -0.6173697 -0.47022920 -0.3036546 -0.5504837 -0.5144803 -0.5587944 -0.4301868
5 0.4461372 -0.7938108 -0.55406573 3.0983433 1.4122942 0.7280222 -0.7453183 -0.5320357
6 0.5534067 2.0996788 3.06848680 -0.3467066 1.9304181 2.3142417 2.6030420 3.1020546
[1] 4 4 1 2 3 4 5 4 6 3 1 3
cluster_id_maker(cluster_123_6, table = cluster_table)